Experiment 2 : EDA on Housing.csv¶

InĀ [34]:
import pandas as pd 
df = pd.read_csv("Housing.csv")
df.head()
Out[34]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7229300521 20141013T000000 231300.0 2 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows Ɨ 21 columns

InĀ [35]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB
InĀ [36]:
df.describe()
Out[36]:
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 4.580306e+09 5.400886e+05 3.370795 2.114757 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656873 1788.390691 291.509045 1971.005136 84.402258 98077.939805 47.560053 -122.213896 1986.552492 12768.455652
std 2.876570e+09 3.671268e+05 0.930105 0.770163 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175459 828.090978 442.575043 29.373411 401.679240 53.505026 0.138564 0.140828 685.391304 27304.179631
min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000000 290.000000 0.000000 1900.000000 0.000000 98001.000000 47.155900 -122.519000 399.000000 651.000000
25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000000 1190.000000 0.000000 1951.000000 0.000000 98033.000000 47.471000 -122.328000 1490.000000 5100.000000
50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000000 1560.000000 0.000000 1975.000000 0.000000 98065.000000 47.571800 -122.230000 1840.000000 7620.000000
75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000000 2210.000000 560.000000 1997.000000 0.000000 98118.000000 47.678000 -122.125000 2360.000000 10083.000000
max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 98199.000000 47.777600 -121.315000 6210.000000 871200.000000
InĀ [37]:
df.shape
Out[37]:
(21613, 21)
InĀ [38]:
df.columns
Out[38]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
InĀ [39]:
df = df.drop('date',axis=1)
InĀ [40]:
df = df.drop('id',axis=1)
InĀ [42]:
df['bedrooms'].unique()
Out[42]:
array([ 2,  3,  4,  5,  1,  6,  7,  0,  8,  9, 11, 10, 33])
InĀ [8]:
df['view'].unique()
Out[8]:
array([0, 3, 4, 2, 1])
InĀ [9]:
df.isnull().sum()
Out[9]:
id               0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64
InĀ [10]:
df[df.duplicated()]
#no duplicate data
Out[10]:
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
3951 1825069031 550000.0 4 1.75 2410 8447 2.0 0 3 4 8 2060 350 1936 1980 98074 47.6499 -122.088 2520 14789
14983 6308000010 585000.0 3 2.50 2290 5089 2.0 0 0 3 9 2290 0 2001 0 98006 47.5443 -122.172 2290 7984
20054 8648900110 555000.0 3 2.50 1940 3211 2.0 0 0 3 8 1940 0 2009 0 98027 47.5644 -122.093 1880 3078
InĀ [11]:
df.drop_duplicates(inplace=True)
InĀ [12]:
df.shape
Out[12]:
(21610, 20)
InĀ [13]:
df.corr()
Out[13]:
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
id 1.000000 -0.016769 0.001307 0.005091 -0.012241 -0.132101 0.018480 -0.002717 0.011785 -0.023687 0.008093 -0.010859 -0.005085 0.021217 -0.016688 -0.008126 -0.001862 0.020776 -0.002870 -0.138778
price -0.016769 1.000000 0.308385 0.525142 0.702037 0.089663 0.256801 0.266371 0.397413 0.036366 0.667450 0.605570 0.323827 0.054009 0.126495 -0.053195 0.307006 0.021622 0.585387 0.082449
bedrooms 0.001307 0.308385 1.000000 0.515983 0.576696 0.031709 0.175485 -0.006578 0.079445 0.028452 0.357014 0.477633 0.303095 0.154296 0.018705 -0.152807 -0.008935 0.129483 0.391662 0.029244
bathrooms 0.005091 0.525142 0.515983 1.000000 0.754688 0.087750 0.500669 0.063747 0.187891 -0.124940 0.664990 0.685351 0.283814 0.505997 0.050879 -0.203831 0.024590 0.223047 0.568665 0.087191
sqft_living -0.012241 0.702037 0.576696 0.754688 1.000000 0.172830 0.353954 0.103820 0.284641 -0.058768 0.762719 0.876600 0.435054 0.318099 0.055314 -0.199435 0.052521 0.240221 0.756420 0.183285
sqft_lot -0.132101 0.089663 0.031709 0.087750 0.172830 1.000000 -0.005171 0.021601 0.074753 -0.008967 0.113644 0.183527 0.015271 0.053105 0.007677 -0.129611 -0.085680 0.229552 0.144620 0.718556
floors 0.018480 0.256801 0.175485 0.500669 0.353954 -0.005171 1.000000 0.023711 0.029325 -0.263779 0.458151 0.523874 -0.245673 0.489347 0.006156 -0.059025 0.049593 0.125343 0.279861 -0.011250
waterfront -0.002717 0.266371 -0.006578 0.063747 0.103820 0.021601 0.023711 1.000000 0.401993 0.016652 0.082785 0.072080 0.080585 -0.026159 0.092950 0.030277 -0.014272 -0.041904 0.086469 0.030702
view 0.011785 0.397413 0.079445 0.187891 0.284641 0.074753 0.029325 0.401993 1.000000 0.045836 0.251378 0.167659 0.276996 -0.053229 0.103208 0.084838 0.006049 -0.078560 0.280404 0.072577
condition -0.023687 0.036366 0.028452 -0.124940 -0.058768 -0.008967 -0.263779 0.016652 0.045836 1.000000 -0.144655 -0.158212 0.174071 -0.361347 -0.060862 0.002962 -0.014971 -0.106512 -0.092854 -0.003424
grade 0.008093 0.667450 0.357014 0.664990 0.762719 0.113644 0.458151 0.082785 0.251378 -0.144655 1.000000 0.755919 0.168443 0.446966 0.014372 -0.184795 0.114086 0.198347 0.713208 0.119265
sqft_above -0.010859 0.605570 0.477633 0.685351 0.876600 0.183527 0.523874 0.072080 0.167659 -0.158212 0.755919 1.000000 -0.051923 0.423922 0.023233 -0.261163 -0.000823 0.343791 0.731870 0.194060
sqft_basement -0.005085 0.323827 0.303095 0.283814 0.435054 0.015271 -0.245673 0.080585 0.276996 0.174071 0.168443 -0.051923 1.000000 -0.133062 0.071320 0.074781 0.110535 -0.144743 0.200367 0.017260
yr_built 0.021217 0.054009 0.154296 0.505997 0.318099 0.053105 0.489347 -0.026159 -0.053229 -0.361347 0.446966 0.423922 -0.133062 1.000000 -0.224729 -0.346808 -0.148099 0.409395 0.326298 0.070999
yr_renovated -0.016688 0.126495 0.018705 0.050879 0.055314 0.007677 0.006156 0.092950 0.103208 -0.060862 0.014372 0.023233 0.071320 -0.224729 1.000000 0.064388 0.029271 -0.068594 -0.002841 0.007837
zipcode -0.008126 -0.053195 -0.152807 -0.203831 -0.199435 -0.129611 -0.059025 0.030277 0.084838 0.002962 -0.184795 -0.261163 0.074781 -0.346808 0.064388 1.000000 0.267064 -0.564069 -0.279033 -0.147257
lat -0.001862 0.307006 -0.008935 0.024590 0.052521 -0.085680 0.049593 -0.014272 0.006049 -0.014971 0.114086 -0.000823 0.110535 -0.148099 0.029271 0.267064 1.000000 -0.135545 0.048839 -0.086423
long 0.020776 0.021622 0.129483 0.223047 0.240221 0.229552 0.125343 -0.041904 -0.078560 -0.106512 0.198347 0.343791 -0.144743 0.409395 -0.068594 -0.564069 -0.135545 1.000000 0.334592 0.254475
sqft_living15 -0.002870 0.585387 0.391662 0.568665 0.756420 0.144620 0.279861 0.086469 0.280404 -0.092854 0.713208 0.731870 0.200367 0.326298 -0.002841 -0.279033 0.048839 0.334592 1.000000 0.183194
sqft_lot15 -0.138778 0.082449 0.029244 0.087191 0.183285 0.718556 -0.011250 0.030702 0.072577 -0.003424 0.119265 0.194060 0.017260 0.070999 0.007837 -0.147257 -0.086423 0.254475 0.183194 1.000000
InĀ [45]:
# Univariate Analysis : analyze distribution of target variable(price)
import matplotlib.pyplot as plt 
import seaborn as sns 
plt.figure()
sns.histplot(df['price'],kde=True)
plt.title("Price Distribution")
plt.show()
No description has been provided for this image
InĀ [67]:
# outlier detection : Checking extreme values in price 
plt.figure()
sns.boxplot(x=df['price'])
plt.title("Price boxplot")
plt.show()
No description has been provided for this image
InĀ [53]:
# Correlation matrix : checking relationship between features.

import matplotlib.pyplot as plt 
import seaborn as sns 
plt.figure(figsize=(17,12))
sns.heatmap(df.corr(),annot=True,cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()
No description has been provided for this image
InĀ [54]:
# Feature vs price Analysis
plt.figure()
sns.scatterplot(x='sqft_living',y='price',data=df)
plt.show()
No description has been provided for this image
InĀ [15]:
#conclusion- It is an imbalanced dataset
df.condition.value_counts().plot(kind='bar') 
plt.xlabel("grade")## Visualization
plt.ylabel("Count")
plt.show()
No description has been provided for this image
InĀ [56]:
df.columns
Out[56]:
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')
InĀ [58]:
sns.histplot(df['sqft_living'],kde=True)
Out[58]:
<Axes: xlabel='sqft_living', ylabel='Count'>
No description has been provided for this image
InĀ [59]:
# univariate, bivariate, multivariate analysis
sns.pairplot(df)
Out[59]:
<seaborn.axisgrid.PairGrid at 0x1c2b4b06270>
No description has been provided for this image
InĀ [65]:
sns.catplot(x='bedrooms',y='price',data=df,kind="box")
Out[65]:
<seaborn.axisgrid.FacetGrid at 0x1c2fb14a710>
No description has been provided for this image